{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef4e72b1",
   "metadata": {},
   "source": [
    "Interannotation agreement, all genre labels, YT\n",
    "percent agreement per genre; cohen's kappa, though falsely assumes independence; krippendorff's for whole labeling process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7847e5da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "Tess = pd.read_excel(\"YT_grouplabels_Tess.xlsx\", sheet_name=\"YT_grouplabels_Tess\")\n",
    "Tan = pd.read_excel(\"YT_grouplabels_Tan.xlsx\", sheet_name=\"yt_grouplabels_set_with_urls\")\n",
    "\n",
    "print(\"Tess Dataset:\")\n",
    "print(Tess.head())\n",
    "\n",
    "print(\"\\nTan Dataset:\")\n",
    "print(Tan.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de1fc95d",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "Tess.drop(Tess.columns[[27, 28]], axis=1, inplace=True)\n",
    "\n",
    "print(\"Tess Dataset after removing blank columns:\")\n",
    "print(Tess.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c74b1c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "#find drop deadlink columns\n",
    "tess_x_rows = Tess[Tess.iloc[:, 15] == 'x']\n",
    "tan_x_rows = Tan[Tan.iloc[:, 15] == 'x']\n",
    "\n",
    "print(\"Indices with 'x' in Tess:\")\n",
    "print(tess_x_rows.index.tolist())\n",
    "\n",
    "print(\"\\nIndices with 'x' in Tan:\")\n",
    "print(tan_x_rows.index.tolist())\n",
    "\n",
    "indices_match = tess_x_rows.index.equals(tan_x_rows.index)\n",
    "print(f\"\\nDo the indices match between Tess and Tan? {indices_match}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "826a6741",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "Tess_cleaned = Tess.drop(tess_x_rows.index)\n",
    "Tan_cleaned = Tan.drop(tan_x_rows.index)\n",
    "\n",
    "print(\"Tess shape after removal:\", Tess_cleaned.shape)\n",
    "print(\"Tan shape after removal:\", Tan_cleaned.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dae61713",
   "metadata": {},
   "outputs": [],
   "source": [
    "#clean stabdardize column names\n",
    "tess_columns = Tess_cleaned.columns[16:45]\n",
    "tan_columns = Tan_cleaned.columns[16:45]\n",
    "\n",
    "column_titles_match = tess_columns.equals(tan_columns)\n",
    "\n",
    "print(\"Column titles in Tess (16-44):\")\n",
    "print(tess_columns.tolist())\n",
    "\n",
    "print(\"\\nColumn titles in Tan (16-44):\")\n",
    "print(tan_columns.tolist())\n",
    "\n",
    "print(f\"\\nDo the column titles match between Tess and Tan? {column_titles_match}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d717d3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "column_range = range(16, 45)\n",
    "\n",
    "def standardize_labels(df):\n",
    "    for col in df.columns[column_range]:\n",
    "        # Find all variations of '1' and convert them to integer 1, others to 0\n",
    "        df[col] = df[col].apply(lambda x: 1 if str(x).strip() in ['1', '1.0', '1'] else 0)\n",
    "    return df\n",
    "\n",
    "Tess_cleaned = standardize_labels(Tess_cleaned)\n",
    "Tan_cleaned = standardize_labels(Tan_cleaned)\n",
    "\n",
    "def print_counts(df, df_name):\n",
    "    for col in df.columns[column_range]:\n",
    "        counts = df[col].value_counts()\n",
    "        print(f\"Column '{col}' in {df_name} - Counts of 1s and 0s:\")\n",
    "        print(counts)\n",
    "        print()\n",
    "\n",
    "# counts\n",
    "print(\"Counts in Tess_cleaned:\")\n",
    "print_counts(Tess_cleaned, \"Tess_cleaned\")\n",
    "\n",
    "print(\"Counts in Tan_cleaned:\")\n",
    "print_counts(Tan_cleaned, \"Tan_cleaned\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8e79906",
   "metadata": {},
   "outputs": [],
   "source": [
    "#cohens kappa; percent agree\n",
    "\n",
    "from sklearn.metrics import cohen_kappa_score\n",
    "\n",
    "def calculate_agreement(Tan_df, Tess_df):\n",
    "    results = []\n",
    "    for col in Tan_df.columns[column_range]:\n",
    "        tan_labels = Tan_df[col]\n",
    "        tess_labels = Tess_df[col]\n",
    "\n",
    "        tan_1s = tan_labels.sum()\n",
    "        tess_1s = tess_labels.sum()\n",
    "        tan_0s = len(tan_labels) - tan_1s\n",
    "        tess_0s = len(tess_labels) - tess_1s\n",
    "\n",
    "        percent_agreement = (tan_labels == tess_labels).mean() * 100\n",
    "\n",
    "        kappa = cohen_kappa_score(tan_labels, tess_labels)\n",
    "\n",
    "        results.append({\n",
    "            'Column': col,\n",
    "            'Tan 1s': tan_1s,\n",
    "            'Tess 1s': tess_1s,\n",
    "            'Tan 0s': tan_0s,\n",
    "            'Tess 0s': tess_0s,\n",
    "            'Percent Agreement': percent_agreement,\n",
    "            'Cohens Kappa': kappa\n",
    "        })\n",
    "    \n",
    "    return results\n",
    "\n",
    "agreement_results = calculate_agreement(Tan_cleaned, Tess_cleaned)\n",
    "\n",
    "for result in agreement_results:\n",
    "    print(f\"Column: {result['Column']}\")\n",
    "    print(f\"Tan 1s: {result['Tan 1s']}, Tess 1s: {result['Tess 1s']}\")\n",
    "    print(f\"Tan 0s: {result['Tan 0s']}, Tess 0s: {result['Tess 0s']}\")\n",
    "    print(f\"Percent Agreement: {result['Percent Agreement']:.2f}%\")\n",
    "    print(f\"Cohens Kappa: {result['Cohens Kappa']:.4f}\")\n",
    "    print(\"-\" * 40)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8eee2329",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Krippendorf "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "717fe395",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "tess_genre_columns = Tess_cleaned.iloc[:, [16, 21, 22, 23, 26, 27, 30, 32, 33, 34, 38, 39, 40, 42, 43, 44]]\n",
    "Tess_genres = tess_genre_columns.copy()\n",
    "\n",
    "\n",
    "tan_genre_columns = Tan_cleaned.iloc[:, [16, 21, 22, 23, 26, 27, 30, 32, 33, 34, 38, 39, 40, 42, 43, 44]]\n",
    "Tan_genres = tan_genre_columns.copy()\n",
    "\n",
    "print(\"Tess Genres:\")\n",
    "print(Tess_genres.head())\n",
    "\n",
    "print(\"\\nTan Genres:\")\n",
    "print(Tan_genres.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1afd882",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "genre_columns = [f'G{i}' for i in range(1, 17)] \n",
    "\n",
    "Tess_genres.columns = genre_columns\n",
    "Tan_genres.columns = genre_columns\n",
    "\n",
    "def get_selected_genres(row):\n",
    "    return [col for col in row.index if row[col] == 1]\n",
    "\n",
    "Tess_labels = Tess_genres.apply(get_selected_genres, axis=1)\n",
    "Tan_labels = Tan_genres.apply(get_selected_genres, axis=1)\n",
    "\n",
    "new_df = pd.DataFrame({\n",
    "    'Tess_labels': Tess_labels,\n",
    "    'Tan_labels': Tan_labels\n",
    "})\n",
    "\n",
    "print(new_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18ed6d72",
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "\n",
    "genres = [f'G{i}' for i in range(1, 17)]  \n",
    "\n",
    "combinations_list = [(genre,) for genre in genres] \n",
    "\n",
    "\n",
    "combinations_list += [tuple(sorted(pair)) for pair in combinations(genres[:15], 2)]  \n",
    "\n",
    "combination_mapping = {comb: idx for idx, comb in enumerate(combinations_list)}\n",
    "\n",
    "total_combinations = len(combination_mapping)\n",
    "sample_combinations = list(combination_mapping.items())[:20] \n",
    "\n",
    "print(f\"Total combinations: {total_combinations}\")\n",
    "for combination, identifier in sample_combinations:\n",
    "    print(f\"Combination: {combination}, Identifier: {identifier}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c6ed0e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def encode_genre_selection(selection, mapping):\n",
    "    selection_tuple = tuple(sorted(selection))\n",
    "    return mapping.get(selection_tuple, -1)\n",
    "\n",
    "new_df['Tess_encoded'] = new_df['Tess_labels'].apply(lambda x: encode_genre_selection(x, combination_mapping))\n",
    "new_df['Tan_encoded'] = new_df['Tan_labels'].apply(lambda x: encode_genre_selection(x, combination_mapping))\n",
    "\n",
    "print(new_df[['Tess_labels', 'Tess_encoded', 'Tan_labels', 'Tan_encoded']].head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e54df36b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#find -1s where no mapping was possible because no new label beyond prank was given"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5706ecb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "rows_with_minus_one = new_df[(new_df['Tess_encoded'] == -1) | (new_df['Tan_encoded'] == -1)]\n",
    "\n",
    "count_minus_one = rows_with_minus_one.shape[0]\n",
    "\n",
    "print(f\"Number of rows with -1 in either Tess_encoded or Tan_encoded: {count_minus_one}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d115b6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove rows with -1 in either Tess_encoded or Tan_encoded\n",
    "cleaned_new_df = new_df[(new_df['Tess_encoded'] != -1) & (new_df['Tan_encoded'] != -1)]\n",
    "\n",
    "rows_removed = new_df.shape[0] - cleaned_new_df.shape[0]\n",
    "print(f\"Number of rows removed: {rows_removed}\")\n",
    "\n",
    "print(cleaned_new_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b1c2be4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import krippendorff\n",
    "\n",
    "reliability_data = [\n",
    "    cleaned_new_df['Tess_encoded'].values,\n",
    "    cleaned_new_df['Tan_encoded'].values\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75b95285",
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha = krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal')\n",
    "\n",
    "print(f\"Krippendorff's Alpha for the cleaned data: {alpha}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7c8b498",
   "metadata": {},
   "outputs": [],
   "source": [
    "#now let's calculate as if -1 was a selection (i.e., including prank labeling as part of process)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a738ece4",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "genre_columns = [f'G{i}' for i in range(1, 17)]  \n",
    "\n",
    "Tess_genres.columns = genre_columns\n",
    "Tan_genres.columns = genre_columns\n",
    "\n",
    "def get_selected_genres(row):\n",
    "    return [col for col in row.index if row[col] == 1]\n",
    "\n",
    "Tess_labels = Tess_genres.apply(get_selected_genres, axis=1)\n",
    "Tan_labels = Tan_genres.apply(get_selected_genres, axis=1)\n",
    "\n",
    "new_df = pd.DataFrame({\n",
    "    'Tess_labels': Tess_labels,\n",
    "    'Tan_labels': Tan_labels\n",
    "})\n",
    "\n",
    "print(new_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6657bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "genres = [f'G{i}' for i in range(1, 17)] \n",
    "\n",
    "combinations_list = [(genre,) for genre in genres] \n",
    "\n",
    "combinations_list += [tuple(sorted(pair)) for pair in combinations(genres[:15], 2)]  # Identifiers 16 to 120\n",
    "\n",
    "combination_mapping = {comb: idx for idx, comb in enumerate(combinations_list)}\n",
    "\n",
    "total_combinations = len(combination_mapping)\n",
    "sample_combinations = list(combination_mapping.items())[:20]\n",
    "\n",
    "print(f\"Total combinations: {total_combinations}\")\n",
    "for combination, identifier in sample_combinations:\n",
    "    print(f\"Combination: {combination}, Identifier: {identifier}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8558e3b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def encode_genre_selection(selection, mapping):\n",
    "    selection_tuple = tuple(sorted(selection))\n",
    "    return mapping.get(selection_tuple, -1)\n",
    "\n",
    "new_df['Tess_encoded'] = new_df['Tess_labels'].apply(lambda x: encode_genre_selection(x, combination_mapping))\n",
    "new_df['Tan_encoded'] = new_df['Tan_labels'].apply(lambda x: encode_genre_selection(x, combination_mapping))\n",
    "\n",
    "print(new_df[['Tess_labels', 'Tess_encoded', 'Tan_labels', 'Tan_encoded']].head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a98007a",
   "metadata": {},
   "outputs": [],
   "source": [
    "rows_with_minus_one = new_df[(new_df['Tess_encoded'] == -1) | (new_df['Tan_encoded'] == -1)]\n",
    "\n",
    "count_minus_one = rows_with_minus_one.shape[0]\n",
    "\n",
    "print(f\"Number of rows with -1 in either Tess_encoded or Tan_encoded: {count_minus_one}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8168e51a",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "reliability_data = [\n",
    "    new_df['Tess_encoded'].values,\n",
    "    new_df['Tan_encoded'].values\n",
    "]\n",
    "\n",
    "alpha = krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal')\n",
    "\n",
    "print(f\"Krippendorff's Alpha including -1 as its own category: {alpha}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7219a52f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
